import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pandas.plotting import autocorrelation_plot
import seaborn as sns; sns.set(rc={'figure.figsize': (11.7, 8.27)})
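# Assumed layout of metrics.csv: one time series per row, one 1-hour tick per
# column, with -1.0 marking missing values; the transpose puts each series
# into its own dataframe column.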
df_raw = pd.read_csv("metrics.csv", header=None, dtype=float, na_values=-1.0).transpose()
print("Analysing & preprocessing imported data...\n")
print("Number of time series: " + str(df_raw.columns.size))
print("Number of 1 hour time steps: " + str(df_raw.index.size))
thresh_percent = 0.80
thresh_num = int(df_raw.index.size * thresh_percent)
print("\nDropping time series with more than " + str(thresh_percent * 100) + "% " + "absent values.")
df = df_raw.dropna(axis='columns', thresh=thresh_num)
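# Quick sanity check: report how many sparse series the threshold removed.
print("Number of dropped time series: " + str(df_raw.columns.size - df.columns.size))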
# Drop time series that contain only zero values. Collect the columns first
# so df is not modified while we iterate over it.
zeros = []
for col in df:
    if df[col].sum() == 0:
        zeros.append(col)
df = df.drop(columns=zeros)
print("\nDropping time series with only zero values: " + str(zeros))
print("Number of time series after drop: " + str(df.columns.size))
# interpolate time-series with missing values using cubic splines
def get_na_ticks(dframe):
    na_rows = []
    print("\nChecking dataframe with " + str(dframe.index.size) + " ticks.")
    for index, row in dframe.iterrows():
        # Flag time steps where less than 95% of the series have a valid value.
        if row.count() < (0.95 * dframe.columns.size):
            na_rows.append(index)
    print("Time steps with less than 95% valid values: " + str(na_rows))
    return na_rows
na_rows = get_na_ticks(df)
df = df.interpolate(method='cubic')
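# Note: interpolation only fills interior gaps, so NaNs can survive at the
# very start of a series; report any that remain.
print("NaN values remaining after interpolation: " + str(int(df.isna().sum().sum())))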
for row in na_rows:
    title_str = "Area around time step " + str(row) + " with "
    df_raw.iloc[row-10:row+10].plot(linewidth=0.7, title=title_str + "missing values", legend=None, figsize=(20,8))
    df.iloc[row-10:row+10].plot(linewidth=0.7, title=title_str + "interpolated values", legend=None, figsize=(20,8))
df.describe()
df.plot(linewidth=1, alpha=0.75, legend=None, figsize=(20,8), title="All time series")
df.plot(linewidth=1, alpha=0.75, legend=None, figsize=(20,8), title="Daily periodicity of dataset", c='b')
for step in range(0, df.index.size, 24):
    plt.axvline(x=step, c='r')
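# The red 24-step grid above suggests a daily cycle. As a quick optional check
# (the choice of the first column is arbitrary), an autocorrelation plot
# should show peaks at lags that are multiples of 24:
autocorrelation_plot(df[df.columns[0]].dropna())
plt.show()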
# Filtering high, medium and low peak time series
h = []  # high peaks (max > 1.0)
m = []  # medium peaks (0.25 < max <= 1.0)
l = []  # low peaks (max <= 0.25)
n = []  # series that take negative values
for col in df:
    if df[col].min() < 0.0:
        n.append(col)
    elif df[col].max() > 1.0:
        h.append(col)
    elif df[col].max() > 0.25:
        m.append(col)
    else:
        l.append(col)
print("Number of time series with high peaks: " + str(len(h)))
print("Number of time series with medium peaks: " + str(len(m)))
print("Number of time series with low peaks: " + str(len(l)))
fig, ax = plt.subplots(figsize=(20,8))
ax.set_label("Grouping of time series by peaks")
plt.plot(df[h], c='#f48f42')
plt.plot(df[m], c='#4158f4')
plt.plot(df[l], c='#1acc49')
plt.show()
fig, ax = plt.subplots(figsize=(20,8))
ax.set_label("Grouping of time series that take negative values")
plt.plot(df[n])
plt.show()
plt.close()
# Normalize the low-peak time series before smoothing and distance comparison
from sklearn.preprocessing import MinMaxScaler
def normalize_dframe(dframe):
    df_norm = pd.DataFrame()
    for col in dframe:
        # prepare data for normalization
        values = dframe[col].values
        values = values.reshape((len(values), 1))
        # fit the scaler and rescale the series to [0, 1]
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaler = scaler.fit(values)
        normalized = scaler.transform(values)
        df_norm[col] = normalized.flatten()
    return df_norm
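# Note: since MinMaxScaler only rescales each column to [0, 1], an equivalent
# vectorised alternative (assuming purely numeric columns) would be:
#   df_norm = (dframe - dframe.min()) / (dframe.max() - dframe.min())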
def rollmean_dframe(dframe, window):
    return dframe.rolling(window=window).mean()
def apply_offset_mean(dframe):
    # Shift each series vertically so that its mean sits at zero.
    offset_vert = pd.DataFrame()
    for col in dframe:
        offset_vert[col] = dframe[col] - dframe[col].mean()
    return offset_vert
def get_mean_series(dframe):
    means = []
    for row in dframe.index:
        means.append(dframe.loc[row].mean())
    return pd.Series(data=means)
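# Note: with the default RangeIndex used here, get_mean_series(dframe) is
# equivalent to dframe.mean(axis=1), which also skips NaN values.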
df_norm = normalize_dframe(df[l])
rolled = rollmean_dframe(df_norm, 24)
rolled_offset = apply_offset_mean(rolled)
mean_line = get_mean_series(rolled_offset)
rolled.plot(linewidth=0.7, legend=None, figsize=(20,8), title="Normalized data with rolling mean (24 hour window)")
plt.figure(figsize=(20,8))
plt.plot(rolled_offset,linewidth=0.7, alpha=0.75)
plt.plot(mean_line, linewidth=3, c='r')
plt.show()
def calc_dist_to_series(dframe, series):
    # Absolute point-wise distance of every column to the reference series.
    distances = pd.DataFrame()
    for col in dframe:
        distances[col] = (dframe[col] - series)
    return distances.abs()
distances = calc_dist_to_series(rolled_offset, mean_line)
distances.plot(linewidth=0.7, legend=None, figsize=(20,8), title="Distances to mean line")
high_dist = []
# Split the data at time step 625 and apply a looser threshold to the later part.
ts_first = distances.iloc[:625]
ts_last = distances.iloc[625:]
for col in distances:
    if ts_first[col].max() > 0.15 or ts_last[col].max() > 0.22:
        high_dist.append(col)
rest = [x for x in distances.columns if x not in high_dist]
rolled_offset[high_dist].plot(linewidth=0.7, legend=None, figsize=(20,8), title="Time series with high distance")
rolled_offset[rest].plot(linewidth=0.7, legend=None, figsize=(20,8), title="Remaining time series fitting the mean line")
# Summarizing current results
groups = {
'high': (h + m),
'outliers': (high_dist + n),
'main': (rest),
}
# Since this is only a broad filtering, a look at the correlation between the
# time series inside each group shows how well the mean-line filtering worked.
fig, axes = plt.subplots(2, 2, figsize=(20,10))
def create_corr_heatmap(dframe, x, y, title):
    # Note: draws into the global `axes` grid created by the most recent
    # plt.subplots(2, 2, ...) call.
    ax = sns.heatmap(
        data=dframe.corr('pearson'),
        vmin=0, vmax=1, center=0.5,
        cmap=sns.diverging_palette(20, 220, n=200),
        ax=axes[x][y]
    )
    ax.set(title=title)
create_corr_heatmap(df, 0, 0, "All time series")
create_corr_heatmap(df[groups['high']], 0, 1, "Time series with high peaks")
create_corr_heatmap(df[groups['outliers']], 1, 0, "Outliers")
create_corr_heatmap(df[groups['main']], 1, 1, "Main Group")
plt.show()
# Initially move every series that does not correlate with its group into the outliers.
for group in groups:
    if group != 'outliers':
        corr = df[groups[group]].corr('pearson')
        for col in corr:
            # Mean of the top 25% of correlations, excluding the trailing
            # self-correlation of 1.0.
            top_corr = corr[col].sort_values()[int(0.75 * len(corr[col])):len(corr[col]) - 1]
            if top_corr.mean() < 0.85:
                groups[group].remove(col)
                groups['outliers'].append(col)
# Create a set of real outliers from the current outliers: a series whose
# lowest 10% of correlations average below 0.15 correlates with almost nothing.
real_outliers = []
corr = df[groups['outliers']].corr('pearson')
for col in corr:
    if corr[col].sort_values()[:int(0.1 * len(corr[col]))].mean() < 0.15:
        groups['outliers'].remove(col)
        real_outliers.append(col)
fig, axes = plt.subplots(2, 2, figsize=(20,10))
create_corr_heatmap(df[real_outliers], 0, 0, "Real outliers")
create_corr_heatmap(df[groups['high']], 0, 1, "Time series with high peaks")
create_corr_heatmap(df[groups['outliers']], 1, 0, "Remaining outliers")
create_corr_heatmap(df[(groups['main']+groups['high'])], 1, 1, "Main Group")
plt.show()
from scipy.cluster import hierarchy as hc
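# Hierarchical clustering on a correlation-based distance: with
# d = 1 - Pearson correlation, strongly correlated series have a distance
# near 0 and uncorrelated ones a distance near 1, so the linkage groups
# co-moving series together.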
# High
fig, ax = plt.subplots(figsize=(20,10))
ax.set_title("high peak series")
dframe = df[groups['high']]
corr = 1-dframe.corr()
corr = corr.fillna(0.0)
corr_condensed = hc.distance.squareform(corr)
z_high = hc.linkage(corr_condensed, method='average')
plt.axhline(y=0.16, c='black')
dendrogram = hc.dendrogram(z_high, labels=corr.columns)
plt.show()
# Main
fig, ax = plt.subplots(figsize=(20,10))
ax.set_title('main group')
dframe = df[groups['main']]
corr = 1-dframe.corr()
corr = corr.fillna(0.0)
corr_condensed = hc.distance.squareform(corr)
z_main = hc.linkage(corr_condensed, method='average')
plt.axhline(y=0.2, c='black')
dendrogram = hc.dendrogram(z_main, labels=corr.columns)
plt.show()
# We cut both trees into two clusters each.
def get_clusters(z, clusters, group):
    cut_tree = hc.cut_tree(z, n_clusters=clusters)
    clustered_series = []
    for cluster in range(0, clusters):
        bin_cols = []
        for idx, col in enumerate(group):
            if cut_tree[idx][0] == cluster:
                bin_cols.append(col)
        clustered_series.append(bin_cols)
    return clustered_series
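# Note: scipy can produce the same flat clustering directly via
# hc.fcluster(z, t=n_clusters, criterion='maxclust'), which returns one
# 1-based cluster label per series, if a label array is preferred over lists.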
cluster_high = get_clusters(z_high, 2, groups['high'])
cluster_main = get_clusters(z_main, 2, groups['main'])
print('high cluster 1:\n' + str(cluster_high[0]))
print('\nhigh cluster 2:\n' + str(cluster_high[1]))
print('\nmain cluster 1:\n' + str(cluster_main[0]))
print('\nmain cluster 2:\n' + str(cluster_main[1]))
# New summary
groups_new = {
'cluster1': (cluster_high[0] + cluster_main[1]),
'cluster2': (cluster_main[0]),
'remaining': (groups['outliers'] + cluster_high[1]),
'outliers': real_outliers
}
fig, axes = plt.subplots(2, 2, figsize=(20,10))
create_corr_heatmap(df[groups_new['cluster1']], 0, 0, "Cluster 1")
create_corr_heatmap(df[groups_new['cluster2']], 0, 1, "Cluster 2")
create_corr_heatmap(df[groups_new['remaining']], 1, 0, "Remaining time series")
create_corr_heatmap(df[groups_new['outliers']], 1, 1, "Outliers")
plt.show()
# z-score calculation for the remaining series
def calc_zscore(dframe):
    zscores = pd.DataFrame()
    for col in dframe:
        zscores[col] = (dframe[col] - dframe[col].mean()) / dframe[col].std()
    return zscores
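# Note: pandas mean() and std() skip NaN by default, so the leading NaNs
# produced by the 24-step rolling mean below do not distort the z-scores.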
df_rolled = rollmean_dframe(df[groups_new['remaining']],24)
zscores = calc_zscore(df_rolled)
high_z = []
medium_z = []
low_z = []
for col in zscores:
    # Difference between the magnitudes of the largest positive and largest
    # negative z-score; a large gap indicates one-sided spikes.
    max_score = np.absolute(zscores[col].max())
    min_score = np.absolute(zscores[col].min())
    diff = max_score - min_score
    if diff > 5:
        high_z.append(col)
    elif diff > 2:
        medium_z.append(col)
    else:
        low_z.append(col)
fig, axes = plt.subplots(2, 2, figsize=(20,10))
create_corr_heatmap(df[groups_new['remaining']], 0, 0, "All remaining")
create_corr_heatmap(df[high_z], 0, 1, "High zscores")
create_corr_heatmap(df[medium_z], 1, 0, "Medium zscores")
create_corr_heatmap(df[low_z], 1, 1, "Low zscores")
plt.show()
print(groups_new['cluster1'])
print('\nLength: ' + str(len(groups_new['cluster1'])))
df[groups_new['cluster1']].plot(linewidth=0.7, legend=None, figsize=(20,8), title="Time series of Cluster 1")
print(groups_new['cluster2'])
print('\nLength: ' + str(len(groups_new['cluster2'])))
df[groups_new['cluster2']].plot(linewidth=0.7, legend=None, figsize=(20,8), title="Time series of Cluster 2")
print(groups_new['remaining'])
print('\nLength: ' + str(len(groups_new['remaining'])))
df[groups_new['remaining']].plot(linewidth=0.7, legend=None, figsize=(20,8), title="Remaining time series that need further clustering ")
print(groups_new['outliers'])
print('\nLength: ' + str(len(groups_new['outliers'])))
df[groups_new['outliers']].plot(linewidth=0.7, legend=None, figsize=(20,8), title="Outliers")